Information Gain Measure
Impurity
Entropy
Gini
Use the SOCR Neonatal Pain data to build and display a decision tree, recursively partitioning the data using the provided features and attributes to split the data into clusters.
Create two classes using the variable Cluster; afterwards the first (grouping) column is dropped with `np <- np[ , -1]`.
# Scrape the SOCR Neonatal Pain (Infant VitK shot) data table from the wiki page
library(xml2)
library(rvest)
np <- read_html('http://wiki.socr.umich.edu/index.php/SOCR_Data_NIPS_InfantVitK_ShotData')
np <- html_table(html_nodes(np, "table")[[1]])
# Binary class label: TRUE when Cluster is below its mean.
# Use TRUE/FALSE instead of T/F: T and F are ordinary (reassignable) bindings.
np$cl <- np$Cluster < mean(np$Cluster)
np$cl <- factor(np$cl, levels = c(TRUE, FALSE), labels = c("Cluster 1", "Cluster 2"))
# Drop the first column — presumably a row/ID column, not a predictor (TODO confirm)
np <- np[ , -1]
np
## # A tibble: 158 × 8
## Group_NC1_Interv2 Immediate `30_Sec_Later` `60_Sec_Later` `120_Sec_Later`
## <int> <int> <int> <int> <int>
## 1 1 6 7 6 2
## 2 1 5 1 2 0
## 3 1 7 6 6 7
## 4 1 3 7 3 0
## 5 1 7 5 6 0
## 6 1 6 6 6 2
## 7 1 7 7 6 0
## 8 1 6 7 0 0
## 9 1 5 0 4 0
## 10 1 7 7 7 6
## # ℹ 148 more rows
## # ℹ 3 more variables: Total_Cry_Time <int>, Cluster <int>, cl <fct>
Create random training and test datasets
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Reproducible 80/20 train/test split
set.seed(1234)
train_index <- sample(seq_len(nrow(np)), size = 0.8*nrow(np))
np_train <- np[train_index, ]
np_test <- np[-train_index, ]
# Syntactic short names for the timed pain-score columns.
# The rename map is defined once (the original duplicated the same
# three renames for the train and test sets).
new_names <- c(thirty = "30_Sec_Later",
               sixty = "60_Sec_Later",
               onetwenty = "120_Sec_Later")
np_train <- np_train %>% rename(all_of(new_names))
np_test <- np_test %>% rename(all_of(new_names))
# Class balance in the training set
prop.table(table(np_train$cl))
##
## Cluster 1 Cluster 2
## 0.531746 0.468254
prop.table(table(np_test$cl))
##
## Cluster 1 Cluster 2
## 0.4375 0.5625
Train a decision tree model on the data, use C5.0 and rpart, separately
library(C50)
## Warning: package 'C50' was built under R version 4.2.3
# Fit a C5.0 classification tree. Columns 7 (Cluster) and 8 (cl) are
# excluded from the predictors because cl was derived directly from Cluster.
set.seed(123)
np_model <- C5.0(np_train[,-c(7, 8)], np_train$cl)
np_model
##
## Call:
## C5.0.default(x = np_train[, -c(7, 8)], y = np_train$cl)
##
## Classification Tree
## Number of samples: 126
## Number of predictors: 6
##
## Tree size: 2
##
## Non-standard options: attempt to group attributes
summary(np_model)
##
## Call:
## C5.0.default(x = np_train[, -c(7, 8)], y = np_train$cl)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Apr 16 18:09:09 2023
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 126 cases (7 attributes) from undefined.data
##
## Decision tree:
##
## Group_NC1_Interv2 <= 1: Cluster 2 (60/1)
## Group_NC1_Interv2 > 1: Cluster 1 (66)
##
##
## Evaluation on training data (126 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 1( 0.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 66 1 (a): class Cluster 1
## 59 (b): class Cluster 2
##
##
## Attribute usage:
##
## 100.00% Group_NC1_Interv2
##
##
## Time: 0.0 secs
library(rpart)
# Fit an rpart tree on the same training data; drop column 7 (Cluster)
# so the derived label cl is not predicted from its own source variable.
np.rpart<-rpart(cl~., data=np_train[,-c(7)])
np.rpart
## n= 126
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 126 59 Cluster 1 (0.53174603 0.46825397)
## 2) Group_NC1_Interv2>=1.5 66 0 Cluster 1 (1.00000000 0.00000000) *
## 3) Group_NC1_Interv2< 1.5 60 1 Cluster 2 (0.01666667 0.98333333) *
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
rpart.plot(np.rpart, digits=3)
Evaluate the model performance and compare the C5.0
and rpart results
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
# Predict test-set classes with the C5.0 model (same predictor columns as training)
np_pred <- predict(np_model, np_test[ ,-c(7,8)])
# Cross-tabulate predictions against true labels and report accuracy metrics
confusionMatrix(table(np_pred, np_test$cl))
## Confusion Matrix and Statistics
##
##
## np_pred Cluster 1 Cluster 2
## Cluster 1 13 0
## Cluster 2 1 18
##
## Accuracy : 0.9688
## 95% CI : (0.8378, 0.9992)
## No Information Rate : 0.5625
## P-Value [Acc > NIR] : 2.612e-07
##
## Kappa : 0.936
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9286
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9474
## Prevalence : 0.4375
## Detection Rate : 0.4062
## Detection Prevalence : 0.4062
## Balanced Accuracy : 0.9643
##
## 'Positive' Class : Cluster 1
##
# Predict test-set classes with the rpart model and compare with C5.0 above
np.p<-predict(np.rpart, np_test,type = 'class')
confusionMatrix(table(np.p, np_test$cl))
## Confusion Matrix and Statistics
##
##
## np.p Cluster 1 Cluster 2
## Cluster 1 13 0
## Cluster 2 1 18
##
## Accuracy : 0.9688
## 95% CI : (0.8378, 0.9992)
## No Information Rate : 0.5625
## P-Value [Acc > NIR] : 2.612e-07
##
## Kappa : 0.936
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9286
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9474
## Prevalence : 0.4375
## Detection Rate : 0.4062
## Detection Prevalence : 0.4062
## Balanced Accuracy : 0.9643
##
## 'Positive' Class : Cluster 1
##
Tune the parameters for rpart and evaluate again
# Grow a fully unpruned tree (cp = 0, minsplit = 2) so the complexity-
# parameter table spans the whole pruning sequence; xval = 100 requests
# 100-fold cross-validation for the xerror estimates.
# BUG FIX: the original call spelled the argument `xxval`, which
# rpart.control() silently swallows via `...`, so the default xval = 10
# was actually in effect.
set.seed(1234)
control <- rpart.control(cp = 0.000, xval = 100, minsplit = 2)
np_model <- rpart(cl ~ ., data = np_train[ , -7], control = control)
# Complexity-parameter table (CP, nsplit, rel error, xerror, xstd) for plotting
data <- as.data.frame(np_model$cptable)
data$CP <- as.factor(data$CP)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive plot of cross-validated error (xerror, with xstd error bars)
# against each candidate complexity-parameter value
plot_ly(data = data, x = ~CP, y = ~xerror, type = 'scatter', mode='lines+markers',
name = 'Test', error_y = ~list(array = xstd, color = 'gray')) %>%
layout(title="Complexity Parameter vs. (CV) Error Rate")
library(rattle)
## Warning: package 'rattle' was built under R version 4.2.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Prune back to the CP value minimizing the cross-validated error, then plot
selected_tr <- prune(np_model, cp= np_model$cptable[which.min(np_model$cptable[,"xerror"]),"CP"])
fancyRpartPlot(selected_tr, cex = 1, caption = "rattle::fancyRpartPlot (NP Data)")
Make predictions on testing data and assess the prediction accuracy - report the confusion matrix
# Predict test-set classes with the tuned (unpruned cp = 0) rpart model
# NOTE(review): this uses np_model, the full tree, not the pruned selected_tr
np_pred<-predict(np_model, np_test,type = 'class')
confusionMatrix(table(np_pred, np_test$cl))
## Confusion Matrix and Statistics
##
##
## np_pred Cluster 1 Cluster 2
## Cluster 1 14 0
## Cluster 2 0 18
##
## Accuracy : 1
## 95% CI : (0.8911, 1)
## No Information Rate : 0.5625
## P-Value [Acc > NIR] : 1.009e-08
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4375
## Detection Rate : 0.4375
## Detection Prevalence : 0.4375
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Cluster 1
##
## 100% accuracy
Comment on the classification performance
#The Confusion Matrix shows prediction accuracy is 100% (95%CI:89.11%, 100%).
# Sensitivity and Specificity of 1 indicate no mistakes in predicting TP & TN.
Try to apply Random Forest classification and report the variable importance plot, predictions on testing data, and assess the prediction accuracy.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Random forest with 2000 trees; importance=TRUE stores both the
# mean-decrease-in-accuracy and the Gini importance measures.
# Column 7 (Cluster) is dropped as in the earlier models.
set.seed(1234)
rf.fit <- randomForest(cl ~ .,data=np_train[,-7],importance=TRUE,ntree=2000)
library(plotly)
# Variable-importance matrix: per-class columns followed by
# "MeanDecreaseAccuracy" and "MeanDecreaseGini".
imp_rf.fit <- importance(rf.fit)
x <- rownames(imp_rf.fit)
# Index by column name rather than position (the original used [, 3] and
# [, 4]), which stays correct regardless of the number of outcome classes.
y <- imp_rf.fit[, "MeanDecreaseAccuracy"]
plot_ly(x = ~y, y = ~reorder(x,y), name = "Var.Imp", type = "bar") %>%
layout(title="RF Variable Importance Plot (Accuracy)",
xaxis=list(title="Importance (mean decrease in accuracy)"),
yaxis = list(title="Variables (Ordered)"))
y <- imp_rf.fit[, "MeanDecreaseGini"]
plot_ly(x = ~y, y = ~reorder(x,y), name = "Var.Imp", type = "bar") %>%
layout(title="RF Variable Importance Plot (Gini)",
xaxis=list(title="Importance (Gini mean decrease)"),
yaxis = list(title="Variables (Ordered)"))
# Predict test-set classes with the random forest and report the confusion matrix
rf_pred<-predict(rf.fit, np_test, type = 'class')
confusionMatrix(table(rf_pred, np_test$cl))
## Confusion Matrix and Statistics
##
##
## rf_pred Cluster 1 Cluster 2
## Cluster 1 13 0
## Cluster 2 1 18
##
## Accuracy : 0.9688
## 95% CI : (0.8378, 0.9992)
## No Information Rate : 0.5625
## P-Value [Acc > NIR] : 2.612e-07
##
## Kappa : 0.936
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9286
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9474
## Prevalence : 0.4375
## Detection Rate : 0.4062
## Detection Prevalence : 0.4062
## Balanced Accuracy : 0.9643
##
## 'Positive' Class : Cluster 1
##
##Identical with c5.0 & rpart before tuning: The Confusion Matrix shows prediction accuracy is about 96.88% (95%CI:83.78%, 99.92%).